# Load the raw scoreboard dump: keys are game identifiers, values are game records.
dump_path = '../raw_data/full_dump.json'
with open(dump_path) as data_file:
    data = json.load(data_file)
data_keys = list(data.keys())
# Peek at a handful of game identifiers to see the key format.
random.sample(data_keys, 5)
['NA Academy League/2018 Season/Summer Playoffs/Scoreboards/Semifinals and Finals_3_4', 'Circuito de Leyendas Sur/2017 Season/Opening Season/Scoreboards/Week 6_4_2', 'Challengers Korea/2020 Season/Summer Season/Scoreboards/Week 7_1_2', 'Ultraliga/Season 5/Scoreboards/Week 6_6_1', '2016 International Wildcard Qualifier/Scoreboards/4-6_6_1']
# Number of games in the raw dump.
f'input data: {len(data)}'
'input data: 35320'
# Normalize means to expand the nested keys in the JSON file and the subsequent DataFrame.
# Normalize the complete dataframe and make a copy.
# json_normalize flattens the nested game dicts into dotted columns
# (e.g. 'teams.BLUE.name', 'teams.RED.players').
df_normalized_teams = pd.json_normalize(data.values())
# Work on a copy so the normalized original stays untouched for later cells.
df_teams = df_normalized_teams.copy()
df_teams.sample(2)
| id | start | patch | winner | duration | picks_bans | teams.BLUE.name | teams.BLUE.total_turret_kills | teams.BLUE.total_inhibitor_kills | teams.BLUE.total_rift_herald_kills | ... | teams.BLUE.side | teams.BLUE.players | teams.RED.name | teams.RED.total_turret_kills | teams.RED.total_inhibitor_kills | teams.RED.total_rift_herald_kills | teams.RED.total_dragon_kills | teams.RED.total_baron_kills | teams.RED.side | teams.RED.players | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35189 | CLS/2017 Season/Preseason Tournament/Scoreboar... | None | 6.21 | RED | 2104 | [] | Rebirth eSports | 3 | 0 | 0 | ... | BLUE | [{'name': 'MANTARRAYA', 'id': 180979, 'role': ... | Last Kings | 11 | 3 | 0 | 4 | 2 | RED | [{'name': 'Nipphu', 'id': 185761, 'role': 'TOP... |
| 14850 | LCS/2019 Season/Summer Season/Scoreboards/Week... | None | 9.12 | BLUE | 1690 | [{'champion_name': 'Rumble', 'is_ban': True}, ... | Cloud9 | 11 | 3 | 1 | ... | BLUE | [{'name': 'Kumo', 'id': 214202, 'role': 'TOP',... | Clutch Gaming | 0 | 0 | 0 | 1 | 0 | RED | [{'name': 'Huni', 'id': 165378, 'role': 'TOP',... |
2 rows × 22 columns
def get_patch_year(s):
    """Return the calendar year for a patch string, e.g. '9.12' -> 2019.

    The major patch number maps directly onto the season year
    (patch 6.x -> 2016, 11.x -> 2021).
    """
    major_version = s.split('.', 1)[0]
    return 2010 + int(major_version)
# Return the year the game was played.
# Derive a 'year' column from the patch string (e.g. '9.12' -> 2019).
df_teams['year'] = df_teams.patch.apply(get_patch_year)
df_teams[['patch', 'year']].sample(2)
| patch | year | |
|---|---|---|
| 17489 | 9.19 | 2019 |
| 27693 | 11.2 | 2021 |
# Make a dataframe dedicated to the team BLUE.
# Each game's 'teams.BLUE.players' cell holds a list of 5 player dicts;
# explode() yields one row per player, then json_normalize flattens each dict.
df_normalized_BLUE = pd.json_normalize(df_normalized_teams['teams.BLUE.players'].explode())
df_BLUE = df_normalized_BLUE.copy()
df_BLUE.head(5)
| name | id | role | champion_name | champion_id | gold_15 | kills_assists_15 | deaths_15 | total_gold | total_cs | total_kills | total_monster_kills | total_assists | total_deaths | total_damage_taken | total_damage_dealt | win | side | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Kikis | 172122.0 | TOP | Nautilus | 111 | 5326.0 | 8.0 | 1.0 | 12504 | 233 | 2 | 0 | 6 | 1 | 18220 | 156270 | True | BLUE |
| 1 | Broxah | 193072.0 | JGL | Lee Sin | 64 | 5261.0 | 11.0 | 1.0 | 12352 | 153 | 4 | 132 | 7 | 1 | 22212 | 152183 | True | BLUE |
| 2 | Nisqy | 185791.0 | MID | Syndra | 134 | 6009.0 | 13.0 | 1.0 | 13393 | 229 | 5 | 6 | 8 | 1 | 10647 | 177681 | True | BLUE |
| 3 | MrRallez | 183407.0 | BOT | Jhin | 202 | 5304.0 | 11.0 | 0.0 | 13969 | 315 | 2 | 5 | 9 | 0 | 9758 | 228328 | True | BLUE |
| 4 | Klaj | 171882.0 | SUP | Karma | 43 | 2767.0 | 12.0 | 0.0 | 9740 | 38 | 1 | 0 | 11 | 0 | 11917 | 34299 | True | BLUE |
# Make a dataframe dedicated to the team RED.
# Mirrors the BLUE construction: one row per player, dicts flattened to columns.
df_normalized_RED = pd.json_normalize(df_normalized_teams['teams.RED.players'].explode())
df_RED = df_normalized_RED.copy()
df_RED.head(5)
| name | id | role | champion_name | champion_id | gold_15 | kills_assists_15 | deaths_15 | total_gold | total_cs | total_kills | total_monster_kills | total_assists | total_deaths | total_damage_taken | total_damage_dealt | win | side | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Phones | 193289.0 | TOP | Maokai | 57 | 4528.0 | 2.0 | 7.0 | 9611 | 190 | 1 | 13 | 1 | 7 | 41065 | 111536 | False | RED |
| 1 | Obvious | 187241.0 | JGL | Rengar | 107 | 4728.0 | 2.0 | 1.0 | 9640 | 174 | 0 | 126 | 2 | 1 | 27879 | 147035 | False | RED |
| 2 | MagiFelix | 181359.0 | MID | Ryze | 13 | 4893.0 | 2.0 | 3.0 | 11840 | 301 | 0 | 2 | 2 | 3 | 16013 | 205899 | False | RED |
| 3 | Sedrion | 197437.0 | BOT | Varus | 110 | 5133.0 | 2.0 | 1.0 | 12010 | 283 | 2 | 1 | 0 | 1 | 10370 | 184927 | False | RED |
| 4 | Noxiak | 185879.0 | SUP | Nami | 267 | 2521.0 | 2.0 | 2.0 | 7348 | 16 | 0 | 0 | 2 | 2 | 13815 | 15418 | False | RED |
# Tag every player row with its originating game id: each game contributed
# exactly 5 player rows per side, so each id is repeated five times in order.
get_index = df_normalized_teams['id'].tolist()
index_teams = [game_id for game_id in get_index for _ in range(5)]
df_RED['game_id'] = index_teams
df_BLUE['game_id'] = index_teams
# Spot-check: the five blue-side players of a single game.
df_BLUE[df_BLUE['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']
| name | id | role | champion_name | champion_id | gold_15 | kills_assists_15 | deaths_15 | total_gold | total_cs | total_kills | total_monster_kills | total_assists | total_deaths | total_damage_taken | total_damage_dealt | win | side | game_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 165535 | Smurf | 197966.0 | TOP | Trundle | 48 | 5175.0 | 8.0 | 3.0 | 15494 | 294 | 0 | 15 | 8 | 3 | 36337 | 241203 | True | BLUE | 2016 International Wildcard Invitational/Score... |
| 165536 | PvPStejos | 194522.0 | JGL | Graves | 104 | 4865.0 | 11.0 | 2.0 | 16049 | 177 | 6 | 148 | 5 | 2 | 25664 | 231200 | True | BLUE | 2016 International Wildcard Invitational/Score... |
| 165537 | Kira | 172113.0 | MID | Lissandra | 127 | 5533.0 | 14.0 | 2.0 | 16549 | 325 | 3 | 25 | 11 | 2 | 25513 | 252607 | True | BLUE | 2016 International Wildcard Invitational/Score... |
| 165538 | Onesh0tiq | 188541.0 | BOT | Lucian | 236 | 5722.0 | 13.0 | 1.0 | 18339 | 356 | 6 | 15 | 7 | 1 | 15931 | 251707 | True | BLUE | 2016 International Wildcard Invitational/Score... |
| 165539 | Likkrit | 179739.0 | SUP | Tahm Kench | 223 | 3386.0 | 10.0 | 2.0 | 12309 | 80 | 1 | 6 | 9 | 2 | 23811 | 56257 | True | BLUE | 2016 International Wildcard Invitational/Score... |
# Spot-check: the five red-side players of the same sample game.
df_RED[df_RED['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']
| name | id | role | champion_name | champion_id | gold_15 | kills_assists_15 | deaths_15 | total_gold | total_cs | total_kills | total_monster_kills | total_assists | total_deaths | total_damage_taken | total_damage_dealt | win | side | game_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 165535 | Yang | 205635.0 | TOP | Maokai | 57 | 4891.0 | 9.0 | 2.0 | 12994 | 260 | 1 | 12 | 8 | 2 | 26978 | 182014 | False | RED | 2016 International Wildcard Invitational/Score... |
| 165536 | Revolta | 195157.0 | JGL | Kindred | 203 | 4730.0 | 6.0 | 3.0 | 12638 | 178 | 1 | 122 | 5 | 3 | 27564 | 195085 | False | RED | 2016 International Wildcard Invitational/Score... |
| 165537 | tockers | 201599.0 | MID | Ekko | 245 | 5159.0 | 9.0 | 2.0 | 13866 | 285 | 5 | 10 | 4 | 2 | 31698 | 215422 | False | RED | 2016 International Wildcard Invitational/Score... |
| 165538 | micaO | 182405.0 | BOT | Jinx | 222 | 5994.0 | 9.0 | 5.0 | 15322 | 356 | 3 | 14 | 6 | 5 | 19697 | 249792 | False | RED | 2016 International Wildcard Invitational/Score... |
| 165539 | Jockster | 169596.0 | SUP | Thresh | 412 | 3155.0 | 6.0 | 4.0 | 8906 | 36 | 0 | 0 | 6 | 4 | 18449 | 20745 | False | RED | 2016 International Wildcard Invitational/Score... |
# Format the data into a df that simulates the one we will receive from the web.
# rearrange_df is defined in an external helper module (not visible here);
# presumably it pivots player rows into one row per game with role columns — TODO confirm.
df_blue = rearrange_df(df_BLUE)
df_red = rearrange_df(df_RED)
# NOTE(review): this rebinds `data`, clobbering the raw JSON dict loaded at the
# top of the notebook; use a distinct name if the raw dict is still needed later.
data = pd.merge(df_blue, df_red, on='game_id')
df_teams['game_id'] = df_teams['id']
# For the blue side the champions are indexed as _x, red side champions are indexed as _y.
full_data = pd.merge(data, df_teams[['winner', 'game_id']], on='game_id')
full_data
| TOP_x | game_id | JGL_x | BOT_x | MID_x | SUP_x | TOP_y | JGL_y | BOT_y | MID_y | SUP_y | winner | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 111 | EU Challenger Series/2017 Season/Spring Qualif... | 64 | 202 | 134 | 43 | 57 | 107 | 110 | 13 | 267 | BLUE |
| 1 | 50 | EU Challenger Series/2017 Season/Spring Qualif... | 421 | 81 | 126 | 43 | 98 | 56 | 22 | 134 | 90 | RED |
| 2 | 111 | EU Challenger Series/2017 Season/Spring Qualif... | 107 | 22 | 61 | 201 | 78 | 121 | 81 | 134 | 43 | RED |
| 3 | 78 | EU Challenger Series/2017 Season/Spring Qualif... | 164 | 202 | 61 | 90 | 68 | 421 | 110 | 7 | 412 | BLUE |
| 4 | 78 | EU Challenger Series/2017 Season/Spring Qualif... | 2 | 110 | 134 | 267 | 114 | 421 | 15 | 105 | 43 | BLUE |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35315 | 57 | IEM Season 11/Gyeonggi/Scoreboards/Playoffs_2_3 | 421 | 202 | 134 | 432 | 78 | 60 | 81 | 13 | 43 | RED |
| 35316 | 78 | IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_1 | 60 | 81 | 69 | 201 | 57 | 421 | 15 | 61 | 43 | RED |
| 35317 | 78 | IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_2 | 121 | 22 | 13 | 412 | 111 | 421 | 81 | 61 | 43 | BLUE |
| 35318 | 78 | IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_3 | 121 | 81 | 69 | 143 | 85 | 421 | 202 | 61 | 21 | BLUE |
| 35319 | 57 | IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_4 | 64 | 81 | 112 | 201 | 78 | 121 | 202 | 69 | 43 | BLUE |
35320 rows × 12 columns
# Fixed typo in the cell's bare-string comment: "inpute" -> "impute"
# (the string is discarded at runtime, so this is documentation-only).
'''Quick overlook of the data (impute, scale, encode and balance!)'''
# Dtypes and non-null counts for the modelling frame.
full_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 35320 entries, 0 to 35319 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TOP_x 35320 non-null int64 1 game_id 35320 non-null object 2 JGL_x 35320 non-null int64 3 BOT_x 35320 non-null int64 4 MID_x 35320 non-null int64 5 SUP_x 35320 non-null int64 6 TOP_y 35320 non-null int64 7 JGL_y 35320 non-null int64 8 BOT_y 35320 non-null int64 9 MID_y 35320 non-null int64 10 SUP_y 35320 non-null int64 11 winner 35320 non-null object dtypes: int64(10), object(2) memory usage: 3.5+ MB
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn import set_config; set_config(display='diagram')
from transformers import SynergyFeature, RoleFeature, ChampionWinrateFeature
# Parallelize the per-role winrate extraction: one RoleFeature transformer per
# role, each fed its blue (_x) / red (_y) champion-id column pair. Built by
# comprehension to avoid five copy-pasted tuples; names and order are unchanged.
ROLES = ['TOP', 'SUP', 'MID', 'BOT', 'JGL']
preproc = ColumnTransformer(
    [(f'{role}_rate', RoleFeature(role), [f'{role}_x', f'{role}_y'])
     for role in ROLES]
)
# Add the model on top of the preprocessing.
pipe = make_pipeline(preproc, LogisticRegression(solver='liblinear'))
pipe
Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('TOP_rate',
RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate',
RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate',
RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate',
RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate',
RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])),
('logisticregression', LogisticRegression(solver='liblinear'))])Please rerun this cell to show the HTML repr or trust the notebook.Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('TOP_rate',
RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate',
RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate',
RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate',
RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate',
RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])),
('logisticregression', LogisticRegression(solver='liblinear'))])ColumnTransformer(transformers=[('TOP_rate', RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate', RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate', RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate', RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate', RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])['TOP_x', 'TOP_y']
RoleFeature(role='TOP')
['SUP_x', 'SUP_y']
RoleFeature(role='SUP')
['MID_x', 'MID_y']
RoleFeature(role='MID')
['BOT_x', 'BOT_y']
RoleFeature(role='BOT')
['JGL_x', 'JGL_y']
RoleFeature(role='JGL')
LogisticRegression(solver='liblinear')
def Role_DataFrame(df_BLUE, df_RED):
    """Compute the per-role champion-vs-champion win/loss rate table and cache it to CSV.

    Merges the blue- and red-side player frames on game_id, counts how often each
    (blue champion, blue role, red role, red champion) pairing ends in a blue win
    or loss, and divides by the total occurrences of the pairing to get a rate.
    """
    #merge the dataframe of the blue and red teams champions
    df_BLUE_RED = pd.merge(left=df_BLUE, right=df_RED, left_on= 'game_id', right_on= 'game_id')
    #only keep relevant columns
    df_role = df_BLUE_RED[['champion_id_x', 'role_x', 'role_y', 'champion_id_y', 'win_x', 'game_id']]
    #times that a given champion played and won or lost against another champion by role
    # (resulting index has 5 levels, including win_x)
    champion_vs_champion = pd.DataFrame(df_role[['champion_id_x',
        'role_x', 'role_y', 'champion_id_y', 'win_x']].value_counts())
    #times that a given champion played against another champion by role
    # (same index minus the win_x level)
    total_champion_vs_champion = pd.DataFrame(df_BLUE_RED[['champion_id_x',
        'role_x', 'role_y', 'champion_id_y']].value_counts())
    #percentage that a champion has won or lost against another champion (win(or lost)/ total times played)
    # NOTE(review): this relies on pandas aligning a 5-level MultiIndex against a
    # 4-level one during div(); behavior is version-sensitive — verify the output
    # on the installed pandas version.
    rate_champion_vs_champion = champion_vs_champion.div(total_champion_vs_champion)
    #save it as a .csv file -> no need to compute everytime!
    rate_champion_vs_champion.to_csv('role_winrate_champ_vs_champ.csv')
    return rate_champion_vs_champion
#sent the matrix handling to a dedicated function in a utils.py file
from utils import get_synergy, get_vs_rate, get_winrate
from sklearn.base import BaseEstimator, TransformerMixin
class RoleFeature(BaseEstimator, TransformerMixin):
    """Map a (blue champion, red champion) column pair to a historical winrate.

    Expects X with exactly two columns: [<ROLE>_x, <ROLE>_y] holding the blue
    and red champion ids for ``role``. transform() returns a one-column
    DataFrame of rates looked up in the precomputed champ-vs-champ table.
    """

    def __init__(self, role):
        # Get the role winrate champion-vs-champion DataFrame.
        # NOTE: loading the CSV here (rather than in fit) preserves the original
        # behavior; the file must exist wherever the estimator is constructed.
        rate_champion_vs_champion = pd.read_csv('role_winrate_champ_vs_champ.csv',
                                                index_col=[0, 1, 2, 3, 4])
        self.rate_champion_vs_champion = rate_champion_vs_champion
        self.role = role

    def fit(self, X=None, y=None):
        # Stateless transformer: nothing to learn from the training data.
        return self

    def transform(self, X, y=None):
        # Get the winrate of the same-role champion matchup for each row.
        # BUGFIX: use positional .iloc instead of z[0]/z[1] — integer-key
        # __getitem__ on a string-labeled row Series is deprecated and
        # ambiguous in recent pandas.
        df = X.apply(lambda z: get_vs_rate(z.iloc[0], self.role, z.iloc[1],
                                           self.rate_champion_vs_champion),
                     axis=1)
        return pd.DataFrame(df)
from sklearn.preprocessing import LabelEncoder
# Encode the winner labels ('BLUE'/'RED') as integers; fit_transform replaces
# the redundant fit(...).transform(...) round trip on the same data.
y_train = LabelEncoder().fit_transform(full_data.winner)
# Features: champion ids per role for both sides; drop the id and the target.
X_train = full_data.drop(['game_id', 'winner'], axis=1)
# Train pipeline
pipe.fit(X_train, y_train)
Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('TOP_rate',
RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate',
RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate',
RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate',
RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate',
RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])),
('logisticregression', LogisticRegression(solver='liblinear'))])Please rerun this cell to show the HTML repr or trust the notebook.Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('TOP_rate',
RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate',
RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate',
RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate',
RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate',
RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])),
('logisticregression', LogisticRegression(solver='liblinear'))])ColumnTransformer(transformers=[('TOP_rate', RoleFeature(role='TOP'),
['TOP_x', 'TOP_y']),
('SUP_rate', RoleFeature(role='SUP'),
['SUP_x', 'SUP_y']),
('MID_rate', RoleFeature(role='MID'),
['MID_x', 'MID_y']),
('BOT_rate', RoleFeature(role='BOT'),
['BOT_x', 'BOT_y']),
('JGL_rate', RoleFeature(role='JGL'),
['JGL_x', 'JGL_y'])])['TOP_x', 'TOP_y']
RoleFeature(role='TOP')
['SUP_x', 'SUP_y']
RoleFeature(role='SUP')
['MID_x', 'MID_y']
RoleFeature(role='MID')
['BOT_x', 'BOT_y']
RoleFeature(role='BOT')
['JGL_x', 'JGL_y']
RoleFeature(role='JGL')
LogisticRegression(solver='liblinear')
from sklearn.model_selection import cross_val_score
# Cross validate pipeline: mean accuracy over 2 folds. The ~0.52 result below
# shows the role-winrate features alone are barely better than chance.
cross_val_score(pipe, X_train, y_train, cv=2, scoring='accuracy').mean()
0.5201019252548131